Подключаем необходимые библиотеки
library(tidyverse)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(clusterProfiler)
library(biomaRt)
library(org.Hs.eg.db)
library(EnhancedVolcano)
library(GenomicRanges)
library(msigdbr)
library(multiMiR)
library(miRBaseConverter)
library(enrichplot)
library(vsn)
library(rvest)
library(patchwork)
library(dbplyr)
# Load the sample sheet: one row per sample, with the sample ID in `sample`
# and the grouping factor used for the design in `condition`.
coldata <- read_tsv("data/phenotable.tsv", show_col_types = FALSE)
# read_tsv() returns a tibble, and setting row names on a tibble is
# deprecated. Convert to a plain data.frame first so that rows can be keyed by
# sample ID (later code relies on rownames(coldata)).
coldata <- as.data.frame(coldata)
rownames(coldata) <- coldata$sample
coldata
# Load the raw miRNA count table and move the `miRNA` column into the row
# names, leaving one column per sample.
counts <- read.csv("data/miR.Counts.csv", header = TRUE, sep = ",")
counts <- column_to_rownames(counts, var = "miRNA")
head(counts)
# read.csv() prepends "X" to column names that start with a digit; strip it so
# the column names can be compared with the sample sheet.
colnames(counts) <- gsub("^X", "", colnames(counts))
counts_samples <- colnames(counts)
phenotable_samples <- coldata$sample
common_samples <- intersect(counts_samples, phenotable_samples)
# Fail early with a readable message if the sample sheet lists samples that
# have no count column (this used to surface as "undefined columns selected").
missing_samples <- setdiff(phenotable_samples, counts_samples)
if (length(missing_samples) > 0) {
  stop(
    "Samples listed in the phenotable but absent from the count table: ",
    paste(missing_samples, collapse = ", "),
    call. = FALSE
  )
}
# Keep only the sample-sheet samples, in sample-sheet order, so that count
# columns line up with the rows of coldata. The former `counts$miRNA` term was
# dropped: it was always NULL here because column_to_rownames() had already
# removed that column.
counts <- counts[, phenotable_samples, drop = FALSE]
head(counts)
# Load the per-sample annotation report and align it with the sample sheet.
anno <- read.csv(file = "data/annotation.report.csv", header = TRUE, sep = ",")
# Replace "-" with "." in the sample names so they can be matched against
# coldata$sample.
anno$Sample.name.s. <- gsub("-", ".", anno$Sample.name.s.)
# Drop the columns that are not used in the plots below (selected by position).
dropped_cols <- c(2:5, 7, 15)
anno <- anno[, -dropped_cols]
# Keep only samples that are also present in the sample sheet ...
common_samples <- intersect(anno$Sample.name.s., coldata$sample)
in_sample_sheet <- anno$Sample.name.s. %in% common_samples
anno <- anno[in_sample_sheet, ]
# ... and order the rows of anno the same way as the rows of coldata.
row_order <- match(rownames(coldata), anno$Sample.name.s.)
anno <- anno[row_order, ]
anno
# Reshape the annotation table to long format (one row per sample and RNA
# type) and draw stacked bars of the raw read counts per sample.
anno_long <- pivot_longer(
  anno,
  cols = -Sample.name.s.,
  names_to = "RNA_Type",
  values_to = "Count"
)
plt <- ggplot(anno_long, aes(x = Sample.name.s., y = Count, fill = RNA_Type)) +
  geom_col() +
  scale_fill_brewer(palette = "Set3") +
  labs(x = "Sample", y = "Read Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plt)
ggsave(
  "./pictures_transpl/transpl_barplot_alldataset_no_normalised.tiff",
  plot = plt, width = 8, height = 6, dpi = 300, bg = "white"
)
# Convert the per-sample read counts to proportions (each row sums to 1) and
# draw them as stacked bars.
#
# The row totals are computed once, before any column is rescaled. The earlier
# rowwise() + mutate(across(..., ~ . / sum(c_across(...)))) form recomputed the
# total for every column while mutate() was already overwriting the columns
# one by one, so later columns could be divided by a total that included
# already-rescaled values.
rna_type_cols <- setdiff(colnames(anno), "Sample.name.s.")
row_totals <- rowSums(anno[, rna_type_cols, drop = FALSE])
anno_long <- anno %>%
  mutate(across(all_of(rna_type_cols), ~ .x / row_totals)) %>%
  pivot_longer(cols = -Sample.name.s., names_to = "RNA_Type", values_to = "Proportion")
plt <- ggplot(anno_long, aes(x = Sample.name.s., y = Proportion, fill = RNA_Type)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(x = "Sample", y = "Proportion") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set3")
plt
ggsave("./pictures_transpl/transpl_barplot_alldataset_normalised.tiff", plot = plt, width = 8, height = 6, dpi = 300, bg = "white")
# Make `condition` a factor with "no_complications" as the reference level,
# then build the design matrix to inspect it (rank is checked just below).
coldata$condition <- factor(coldata$condition)
coldata$condition <- relevel(coldata$condition, ref = "no_complications")
modelMatrix <- model.matrix(~ condition, data = coldata)
modelMatrix
(Intercept) conditioncellular conditionhumoral conditionTCAD
1 1 0 0 0
2 1 0 0 0
3 1 0 1 0
4 1 1 0 0
5 1 0 0 1
6 1 0 0 1
7 1 0 0 1
8 1 0 0 0
9 1 0 0 0
10 1 0 0 0
11 1 0 0 0
12 1 0 1 0
13 1 0 1 0
14 1 1 0 0
15 1 1 0 0
16 1 1 0 0
17 1 1 0 0
18 1 1 0 0
19 1 0 0 1
20 1 0 0 1
21 1 0 1 0
22 1 0 0 1
23 1 0 0 1
24 1 0 1 0
25 1 0 1 0
26 1 0 0 0
27 1 0 0 0
28 1 0 1 0
29 1 0 0 0
attr(,"assign")
[1] 0 1 1 1
attr(,"contrasts")
attr(,"contrasts")$condition
[1] "contr.treatment"
qr(modelMatrix)$rank # ранг матрицы
[1] 4
ncol(modelMatrix)
[1] 4
dds <- DESeqDataSetFromMatrix(countData = counts, colData = coldata, design = ~ condition)
converting counts to integer mode
dds$condition <- relevel(dds$condition, ref = "no_complications")
dds
class: DESeqDataSet
dim: 913 29
metadata(1): version
assays(1): counts
rownames(913): Hsa-Let-7-P1a_3p* Hsa-Let-7-P1a_5p/P2a1_5p/P2a2_5p ... Hsa-Mir-9851_3p Hsa-Mir-9851_5p*
rowData names(0):
colnames(29): 105_S1_R1_001 197_S2_R1_001 ... 127_S29_R1_001 138_S30_R1_001
colData names(2): sample condition
dim(dds)
[1] 913 29
# Pre-filter low-count features: keep a miRNA only if it has at least 10 raw
# counts in at least `smallestGroupSize` samples.
# NOTE(review): 15 is more than half of the 29 samples and larger than every
# condition group in the design matrix printed above (9/7/7/6 samples).
# Confirm that this stringency, rather than the size of the smallest group,
# is intended.
smallestGroupSize <- 15
n_samples_expressed <- rowSums(counts(dds) >= 10)
keep <- n_samples_expressed >= smallestGroupSize
dds <- dds[keep, ]
dim(dds)
[1] 297 29
dds <- DESeq(dds, fitType = "parametric")
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 31 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
dds
class: DESeqDataSet
dim: 297 29
metadata(1): version
assays(6): counts mu ... replaceCounts replaceCooks
rownames(297): Hsa-Let-7-P1a_5p/P2a1_5p/P2a2_5p Hsa-Let-7-P1b_5p ... Hsa-Mir-96-P3_3p* Hsa-Mir-96-P3_5p
rowData names(31): baseMean baseVar ... maxCooks replace
colnames(29): 105_S1_R1_001 197_S2_R1_001 ... 127_S29_R1_001 138_S30_R1_001
colData names(4): sample condition sizeFactor replaceable
# Dispersion estimates of the fitted model.
plotDispEsts(dds)
# Compare per-sample library totals before and after size-factor
# normalization.
raw_counts <- counts(dds, normalized = FALSE)
normalized_counts <- counts(dds, normalized = TRUE)
sample_ids <- colnames(dds)
df <- data.frame(
  Sample = rep(sample_ids, times = 2),
  Counts = c(colSums(raw_counts), colSums(normalized_counts)),
  Type = rep(c("Raw", "Normalized"), each = length(sample_ids))
)
plt <- ggplot(df, aes(x = Sample, y = Counts, fill = Type)) +
  geom_col(position = "dodge") +
  labs(
    title = "Counts before and after normalization",
    x = "Sample",
    y = "Total Counts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
plt
ggsave(
  "./pictures_transpl/transpl_Counts before and after normalization.tiff",
  plot = plt, width = 8, height = 6, dpi = 300, bg = "white"
)
rlog трансформация
# Regularized-log transformation, followed by a mean-SD diagnostic plot.
# NOTE(review): rlog() is called with its default `blind` setting while the
# VST below uses blind = FALSE; confirm the difference is intended.
rlt <- rlog(object = dds)
meanSdPlot(assay(rlt))
# Variance-stabilizing transformation for comparison. The mean-SD plot shows
# how the standard deviation changes with the mean expression level.
vsd <- varianceStabilizingTransformation(object = dds, blind = FALSE)
meanSdPlot(assay(vsd))
**PCA plot**
pcaData <- plotPCA(rlt, intgroup=c("condition"), returnData = TRUE)
using ntop=500 top features by variance
# Percentage of variance explained by each principal component (axis labels).
percentVar <- round(100 * attr(pcaData, "percentVar"))
# Short point labels: the part of each sample ID before the first "_".
# They are derived from pcaData$name (the sample names returned by plotPCA)
# instead of coldata$sample, so a label can never be attached to the wrong
# point if the row order of the two tables ever differs.
pcaData$sample <- gsub("_.*", "", pcaData$name)
plt <- ggplot(pcaData, aes(PC1, PC2, color = condition)) +
  geom_text(aes(label = sample), size = 3, vjust = 1.5) +
  geom_point(size = 3) +
  xlab(paste0("PC1: ", percentVar[1], "%")) +
  ylab(paste0("PC2: ", percentVar[2], "%")) +
  coord_fixed() +
  theme_bw() +
  scale_color_brewer(palette = "Set2")
plt
ggsave("./pictures_transpl/transpl_PCA plot.tiff", plot = plt, width = 8, height = 6, dpi = 300, bg = "white")
Plot a heatmap of 50 most expressed genes Этот heatmap отражает уровни экспрессии генов, а не разницу между группами. Цвета не означают up- или down-регуляцию в сравнении с контрольной группой, потому что heatmap показывает абсолютные значения экспрессии, а не fold change!
# Row indices of the (up to) 50 features with the highest mean normalized
# count. head() replaces `[1:50]` so that a filtered data set with fewer than
# 50 features does not produce NA indices (which would break the subsetting
# of assay(rlt) below).
mean_norm_counts <- rowMeans(counts(dds, normalized = TRUE))
select <- head(order(mean_norm_counts, decreasing = TRUE), 50)
# Column annotation for the heatmap: the condition of every sample.
df <- as.data.frame(colData(dds)$condition)
colnames(df) <- "condition"
rownames(df) <- colnames(counts(dds))
# The heatmap shows rlog expression levels, not fold changes between groups.
plt <- pheatmap(assay(rlt)[select, ],
                cluster_rows = TRUE,
                show_rownames = TRUE,
                cluster_cols = TRUE,
                annotation_col = df,
                fontsize_row = 6)
plt
ggsave("./pictures_transpl/transpl_Plot a heatmap of 50 most expressed genes.tiff", plot = plt, width = 8, height = 6, dpi = 300, bg = "white")
Plot of the distance between samples heatmap. Расчет расстояний между образцами: • Обычно используется евклидово расстояние (по умолчанию в функции dist()). • Оно вычисляется по трансформированным данным экспрессии (rlog() или vst()). • Чем меньше расстояние, тем более похожи образцы.
# Euclidean distances between samples, computed on the rlog-transformed data.
sampleDists <- dist(t(assay(rlt)))
sampleDistMatrix <- as.matrix(sampleDists)
# Label rows and columns with the condition of each sample.
rownames(sampleDistMatrix) <- paste(rlt$condition)
colnames(sampleDistMatrix) <- paste(rlt$condition)
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
# Pass the precomputed distances to the clustering. With "euclidean",
# pheatmap would compute a second distance between the rows of the distance
# matrix itself and cluster on that instead of on the sample distances.
plt <- pheatmap(sampleDistMatrix,
                clustering_distance_rows = sampleDists,
                clustering_distance_cols = sampleDists,
                color = colors)
plt
ggsave("./pictures_transpl/transpl_Plot of the distance between samples heatmap.tiff", plot = plt, width = 8, height = 6, dpi = 300, bg = "white")
# Wald-test results for the humoral comparison.
# NOTE(review): in a three-element `contrast` the second element is the
# numerator and the third the denominator, so log2FoldChange here is
# log2(no_complications / humoral) (the printed header reads
# "no_complications vs humoral"). A negative value therefore means HIGHER
# expression in the humoral group; confirm that the "down"/"up" naming used
# further below matches the intended direction.
# NOTE(review): no `alpha` is passed here, while padj < 0.05 is used as the
# cutoff later and a separate results() call uses alpha = 0.05; confirm
# whether alpha = 0.05 should be set here as well.
res_humoral <- results(dds, contrast=c("condition", "no_complications", "humoral"))
res_humoral
log2 fold change (MLE): condition no_complications vs humoral
Wald test p-value: condition no_complications vs humoral
DataFrame with 297 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
Hsa-Let-7-P1a_5p/P2a1_5p/P2a2_5p 72441.408 0.0618842 0.288818 0.214267 0.8303389 0.939409
Hsa-Let-7-P1b_5p 2471.713 -0.9029022 0.495172 -1.823412 0.0682410 0.396094
Hsa-Let-7-P1c_5p 1586.095 -1.0840729 0.471749 -2.297986 0.0215626 0.246039
Hsa-Let-7-P2a1_3p* 106.535 0.0898204 0.728328 0.123324 0.9018504 0.974875
Hsa-Let-7-P2a2_3p* 117.677 -2.1053617 1.182746 -1.780062 0.0750658 0.409032
... ... ... ... ... ... ...
Hsa-Mir-95-P3_5p 11.6942 1.248042 1.169651 1.067020 0.2859626 NA
Hsa-Mir-96-P1_5p 452.3189 1.016073 0.712354 1.426361 0.1537643 0.6037509
Hsa-Mir-96-P2_5p 104065.5500 -0.309282 0.550995 -0.561315 0.5745829 0.8960804
Hsa-Mir-96-P3_3p* 27.6400 0.953532 0.804588 1.185117 0.2359712 0.7233923
Hsa-Mir-96-P3_5p 1433.3806 1.044344 0.364247 2.867132 0.0041421 0.0737293
MA plot. 1. Фильтрация точек с низкой средней экспрессией (по baseMean): обычно отсекаются точки с baseMean < 1. 2. Определение значимых генов (синие точки): по умолчанию используется порог padj < 0.1, а не < 0.05!
tiff("./pictures_transpl/transpl_PlotMA_standart_padj_0.05_humoral.tiff",
width = 8, height = 6, units = "in", res = 300, bg = "white")
plotMA(res_humoral, alpha = 0.05, ylim = c(-8, 8))
dev.off()
null device
1
plotMA(res_humoral, alpha = 0.05, ylim = c(-8, 8))
Кастомный MA plot
# Custom MA plot: colour each miRNA by its significance class
# (adjusted p-value first, then raw p-value, otherwise "All").
res_df <- as.data.frame(res_humoral)
res_df <- mutate(
  res_df,
  color = case_when(
    padj < 0.05 ~ "padj < 0.05",
    pvalue < 0.05 ~ "pvalue < 0.05",
    TRUE ~ "All"
  )
)
ma_colours <- c(
  "All" = "gray70",
  "pvalue < 0.05" = "blue",
  "padj < 0.05" = "red"
)
plt <- ggplot(res_df, aes(x = baseMean, y = log2FoldChange, color = color)) +
  geom_point(alpha = 0.7, size = 1) +
  geom_hline(yintercept = 0, linetype = "solid", color = "gray40", size = 1.5) +
  scale_x_log10(labels = scales::scientific) +
  scale_color_manual(values = ma_colours) +
  labs(
    x = "mean of normalized counts",
    y = "log fold change",
    color = NULL
  ) +
  theme_minimal()
plt
ggsave(
  "./pictures_transpl/transpl_Сustom MAplot_humoral.tiff",
  plot = plt, width = 8, height = 6, dpi = 300, bg = "white"
)
Значимые результаты
58 генов (20%) имеют низкие уровни экспрессии и фильтруются из анализа. independent filtering — процедура, которая исключает гены с низкими значениями для увеличения статистической мощности.
signres_humoral <- results(dds, contrast=c("condition", "no_complications", "humoral"), alpha=0.05)
summary(signres_humoral)
out of 297 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 0, 0%
LFC < 0 (down) : 9, 3%
outliers [1] : 1, 0.34%
low counts [2] : 58, 20%
(mean count < 40)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Let’s arrange the results by log2FoldChange (descending):
order_indices <- order(-res_humoral$log2FoldChange)
res_humoral[order_indices, ]
log2 fold change (MLE): condition no_complications vs humoral
Wald test p-value: condition no_complications vs humoral
DataFrame with 297 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
Hsa-Mir-542_3p 179.1170 1.55776 0.661653 2.35434 0.0185557 0.225199
Hsa-Mir-190-P1_5p 19.6983 1.54017 1.120054 1.37509 0.1691041 NA
Hsa-Mir-874_3p 183.9741 1.53509 0.798423 1.92265 0.0545243 0.378167
Hsa-Mir-624_5p 18.9670 1.36865 1.076852 1.27097 0.2037388 NA
Hsa-Mir-95-P3_5p 11.6942 1.24804 1.169651 1.06702 0.2859626 NA
... ... ... ... ... ... ...
Hsa-Mir-873_3p 102.3872 -2.74537 1.229274 -2.23333 2.55274e-02 0.27263230
Hsa-Mir-10-P2a_5p 1162.4941 -2.85595 0.650038 -4.39350 1.11539e-05 0.00148904
Hsa-Mir-483_5p 144.9177 -3.86731 1.288810 -3.00068 2.69377e-03 0.06522233
Hsa-Mir-193-P1b_5p* 152.0458 -4.84885 0.982406 -4.93569 7.98689e-07 0.00021325
Hsa-Mir-193-P1b_3p 99.8971 -5.64893 1.644846 -3.43432 5.94044e-04 0.02643497
Visualisation of counts for the most significant miRNA (smallest padj)
#plotCounts(dds, gene=which.max(res_humoral$log2FoldChange), intgroup="condition")
plotCounts(dds, gene=which.min(res_humoral$padj), intgroup="condition")
#plotCounts(dds, gene=rownames(res)[which.min(res$padj[which.max(res$log2FoldChange)])], intgroup="condition")
Volcano plot
# Volcano plot for the humoral comparison (adjusted p-value on the y axis).
volcano_palette <- c("black", "#CBD5E8", "#B3E2CD", "#FDCDAC")
plt <- EnhancedVolcano(
  res_humoral,
  lab = rownames(res_humoral),
  x = "log2FoldChange",
  y = "padj",
  pCutoff = 0.05,
  FCcutoff = 1,
  labSize = 3.0,
  boxedLabels = FALSE,
  col = volcano_palette,
  colAlpha = 1,
  title = NULL,
  subtitle = NULL
)
plt
ggsave(
  "./pictures_transpl/transpl_VolcanoPlot_humoral.tiff",
  plot = plt, width = 8, height = 6, dpi = 300, bg = "white"
)
# Sample-sheet rows for the two groups in this comparison.
compared_conditions <- c("humoral", "no_complications")
coldata_filtered <- coldata[coldata$condition %in% compared_conditions, ]
coldata_filtered
Plot a heatmap of diff expressed genes
# Significant miRNAs: padj < 0.05 and |log2FoldChange| > 1, sorted by
# decreasing log2FoldChange. which() drops NA comparisons, as subset() does.
is_significant <- which(
  res_humoral$padj < 0.05 &
    !is.na(res_humoral$padj) &
    abs(res_humoral$log2FoldChange) > 1.0
)
res_sign_humoral <- res_humoral[is_significant, ]
lfc_order <- order(res_sign_humoral$log2FoldChange, decreasing = TRUE)
res_sign_humoral <- res_sign_humoral[lfc_order, ]
sig_genes <- rownames(res_sign_humoral)
# rlog values for the significant miRNAs, restricted to the samples of the two
# compared groups.
de_mat <- assay(rlt)[sig_genes, ]
de_mat_filtered <- de_mat[, coldata_filtered$sample]
#datamatrix <- t(scale(t(de_mat_filtered)))
datamatrix <- de_mat_filtered
annotation_col <- data.frame(condition = coldata_filtered$condition)
rownames(annotation_col) <- colnames(datamatrix)
annotation_colors <- list(
  condition = c("no_complications" = "#FFCC00", "humoral" = "#3399FF")
)
plt <- pheatmap(
  datamatrix,
  cluster_rows = TRUE,
  show_rownames = TRUE,
  cluster_cols = TRUE,
  annotation_col = annotation_col,
  annotation_colors = annotation_colors,
  display_numbers = FALSE,
  legend = TRUE,
  fontsize = 15
)
ggsave(
  "./pictures_transpl/transpl_Heatmap of diff expressed genes_humoral.tiff",
  plot = plt, width = 8, height = 6, dpi = 300, bg = "white"
)
# Split the significant miRNAs by the sign of log2FoldChange.
# NOTE(review): with the contrast used above, a positive log2FoldChange means
# higher expression in no_complications than in humoral; confirm that the
# "up"/"down" names reflect the intended direction.
res_sign_humoral_df <- as.data.frame(res_sign_humoral)
up_humoral <- filter(res_sign_humoral_df, log2FoldChange > 0)
down_humoral <- filter(res_sign_humoral_df, log2FoldChange < 0)
rownames(up_humoral)
character(0)
rownames(down_humoral)
[1] "Hsa-Mir-146-P1_5p" "Hsa-Mir-425_3p*" "Hsa-Mir-148-P2_3p" "Hsa-Mir-10-P3c_5p" "Hsa-Mir-10-P2a_5p"
[6] "Hsa-Mir-193-P1b_5p*" "Hsa-Mir-193-P1b_3p"
Переводим в miRBase • miRBase: https://www.mirbase.org/ • MirGeneDB: https://mirgenedb.org/
url <- "https://mirgenedb.org/browse/hsa"
page <- read_html(url)
Парсим таблицу
# Extract the first HTML table of the MirGeneDB browse page and keep its first
# two columns as a MirGeneDB ID -> miRBase ID lookup table.
mir_table <- html_table(html_element(page, "table"), fill = TRUE)
header_rows <- 1:3
mir_table <- mir_table[-header_rows, c(1, 2)]
colnames(mir_table) <- c("MirGeneDB_ID", "MiRBase_ID")
# Remove the first " V" marker from the MirGeneDB IDs.
mir_table$MirGeneDB_ID <- sub(" V", "", mir_table$MirGeneDB_ID)
head(mir_table)
# Strip everything from the first "_" onward (the arm suffix such as "_5p")
# before looking the IDs up in the table; unmatched IDs become NA.
down_humoral_clean <- sub("_.*", "", row.names(down_humoral))
lookup_idx <- match(down_humoral_clean, mir_table$MirGeneDB_ID)
down_humoral_converted <- mir_table$MiRBase_ID[lookup_idx]
down_humoral_converted
[1] "hsa-mir-146b" "hsa-mir-425" NA "hsa-mir-125b-2" NA "hsa-mir-193a"
[7] "hsa-mir-193a"
Конвертация в MIMAT ID: miRNA без соответствия в miRBase (NA) исключены из анализа.
NA: для Hsa-Mir-148-P2_3p есть три похожих соответствия:
• Hsa-Mir-148-P1 = hsa-mir-148a
• Hsa-Mir-148-P3 = hsa-mir-152
• Hsa-Mir-148-P4 = hsa-mir-148b
NA: для Hsa-Mir-10-P2a_5p есть три похожих соответствия:
• Hsa-Mir-10-P2b = hsa-mir-99b
• Hsa-Mir-10-P2c = hsa-mir-99a
• Hsa-Mir-10-P2d = hsa-mir-100
[1] “Hsa-Mir-146-P1_5p” “Hsa-Mir-425_3p” ”Hsa-Mir-148-P2_3p”
”Hsa-Mir-10-P3c_5p” ”Hsa-Mir-10-P2a_5p”
[6] ”Hsa-Mir-193-P1b_5p” “Hsa-Mir-193-P1b_3p”
# Hand-written mature miRBase names for the miRNAs that could be mapped above
# (the two entries that came back as NA are not included), then converted
# with miRNAVersionConvert(); the Accession column of the result is used for
# the multiMiR query below.
mirna_names_down <- c("hsa-miR-146b-5p", "hsa-miR-425-3p", "hsa-miR-125b-5p", "hsa-miR-193a-5p", "hsa-miR-193a-3p")
converted_mirna_down <- miRNAVersionConvert(mirna_names_down)
converted_mirna_down
Запрос таргетов из базы multiMiR
# Query multiMiR for validated targets of the converted miRNAs and keep the
# unique target gene symbols.
multimir_hits_down <- get_multimir(
  org = "hsa",
  mirna = converted_mirna_down$Accession,
  table = "validated"
)
targets_humoral_down <- unique(multimir_hits_down@data$target_symbol)
Searching mirecords ...
Searching mirtarbase ...
Searching tarbase ...
#writeLines(targets_down, "targets_down150_list.txt")
Анализ обогащения из базы биологических процессов
#msig_go_bp <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = "GO:BP")
# targets_down <- readLines("targets_down150_list.txt")
# targets_up <- readLines("targets_up150_list.txt")
# GO Biological Process over-representation analysis for the target genes of
# the selected miRNAs. Genes are supplied as gene symbols, p-values are
# adjusted with Benjamini-Hochberg, and a q-value cutoff of 0.05 is applied.
GO_enrich_down_humoral_bp <- enrichGO(
gene = targets_humoral_down,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05
)
Визуализация
# Dot plot of the top 20 enriched GO BP terms.
dotplot_theme <- theme(
  plot.title = element_text(size = 12, face = "bold"),
  axis.text.y = element_text(size = 11)
)
p1 <- dotplot(GO_enrich_down_humoral_bp, showCategory = 20) +
  ggtitle("GO Enrichment for DOWNregulated targets") +
  dotplot_theme
p1
ggsave(
  "./pictures_transpl/transpl_GO_enrichment_dotplot_down_humoral_bp.tiff",
  plot = p1, width = 16, height = 10, dpi = 300
)
# Enrichment map: terms are linked by pairwise Jaccard similarity of their
# gene sets.
GO_enrich_DOWN_humoral_BP <- enrichplot::pairwise_termsim(
  GO_enrich_down_humoral_bp,
  method = "JC"
)
emap_theme <- theme(
  plot.title = element_text(size = 12, face = "bold"),
  axis.text = element_text(size = 3)
)
plt <- emapplot(GO_enrich_DOWN_humoral_BP, repel = TRUE, showCategory = 20) +
  ggtitle("Biological processes for DOWNregulated targets for humoral") +
  emap_theme
plt
ggsave(
  "./pictures_transpl/transpl_GO_enrichment_emapplot_DOWN_humoral_BP.tiff",
  plot = plt, width = 16, height = 10, dpi = 300
)
# Enrichment map of GO BP terms for an "up-regulated" target set.
# NOTE(review): `GO_enrich_up150_bp` is not defined anywhere in the visible
# part of this file, and the title and output path refer to "vesicles 150"
# and ./pictures rather than the humoral comparison and ./pictures_transpl
# used above. This chunk looks carried over from a different analysis and will
# fail unless that object already exists in the session; confirm whether it
# should be removed or adapted.
GO_enrich_UP150_BP <- enrichplot::pairwise_termsim(GO_enrich_up150_bp, method = "JC")
plt <- emapplot(GO_enrich_UP150_BP,
repel = TRUE,
showCategory = 20) +
ggtitle("Biological processes for UPregulated targets for vesicles 150") +
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 3)
)
plt
ggsave("./pictures/GO_enrichment_emapplot_BPup_type150.tiff", plot = plt, width = 16, height = 10, dpi = 300)
Анализ обогащения из базы IMMUNESIGDB IMMUNESIGDB: Наборы генов, связанные с иммунной системой, включая иммуно-онкологию и другие аспекты иммунитета.
# Over-representation analysis against the MSigDB C7 / IMMUNESIGDB gene sets,
# using gene symbols as identifiers. Despite its name, `msig_go_bp` holds the
# IMMUNESIGDB collection here.
# NOTE(review): `targets150_up` and `targets150_down` are not defined anywhere
# in the visible part of this file (the only target vector created above is
# `targets_humoral_down`); this chunk will fail unless they already exist in
# the session. Confirm which target lists are intended.
msig_go_bp <- msigdbr(species = "Homo sapiens", category = "C7", subcategory = "IMMUNESIGDB")
GO_enrich_up150_immum <- enricher(gene = targets150_up, TERM2GENE = msig_go_bp[, c("gs_name", "gene_symbol")])
GO_enrich_down150_immun <- enricher(gene = targets150_down, TERM2GENE = msig_go_bp[, c("gs_name", "gene_symbol")])
Визуализация IMMUNESIGDB
# Side-by-side dot plots (top 20 terms each) of the IMMUNESIGDB enrichment
# results for the up- and down-regulated target sets, combined with patchwork
# and saved as one figure.
# NOTE(review): the inputs come from the "150" target lists, which are not
# defined in the visible part of this file, and the figure is written to
# ./pictures rather than ./pictures_transpl; confirm this chunk belongs to
# this analysis.
p1 <- dotplot(GO_enrich_up150_immum, showCategory = 20) +
ggtitle("GO Enrichment IMMUNESIGD for UPregulated targets for vesicles 150") +
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 8)
)
p2 <- dotplot(GO_enrich_down150_immun, showCategory = 20) +
ggtitle("GO Enrichment IMMUNESIGD for DOWNregulated targets for vesicles 150") +
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 8)
)
p1 + p2
combined_plot <- p1 + p2
ggsave("./pictures/GO_enrichment_dotplot_IMMUNESIGD_type150.tiff", plot = combined_plot, width = 16, height = 10, dpi = 300)
Анализ обогащения из базы KEGG
# Over-representation analysis against the MSigDB C2 / CP:KEGG gene sets and
# side-by-side dot plots (top 20 terms each) for the up- and down-regulated
# target sets. `msig_go_bp` is reused as the variable name but holds the KEGG
# collection here.
# NOTE(review): `targets150_up` and `targets150_down` are not defined anywhere
# in the visible part of this file, and the figure is written to ./pictures
# rather than ./pictures_transpl; this chunk will fail unless those vectors
# already exist in the session. Confirm which target lists are intended.
msig_go_bp <- msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:KEGG")
GO_enrich_up150_KEGG <- enricher(gene = targets150_up, TERM2GENE = msig_go_bp[, c("gs_name", "gene_symbol")])
GO_enrich_down150_KEGG <- enricher(gene = targets150_down, TERM2GENE = msig_go_bp[, c("gs_name", "gene_symbol")])
p1 <- dotplot(GO_enrich_up150_KEGG, showCategory = 20) +
ggtitle("GO Enrichment KEGG for UPregulated targets for vesicles 150") +
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 8)
)
p2 <- dotplot(GO_enrich_down150_KEGG, showCategory = 20) +
ggtitle("GO Enrichment KEGG for DOWNregulated targets for vesicles 150") +
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 8)
)
p1 + p2
combined_plot <- p1 + p2
ggsave("./pictures/GO_enrichment_dotplot_KEGG_type150.tiff", plot = combined_plot, width = 16, height = 10, dpi = 300)
Анализ обогащения из базы CP:WIKIPATHWAYS
# Over-representation analysis against the MSigDB C2 / CP:WIKIPATHWAYS gene
# sets and side-by-side dot plots (top 20 terms each) for the up- and
# down-regulated target sets. `msig_go_bp` is reused as the variable name but
# holds the WikiPathways collection here.
# NOTE(review): `targets150_up` and `targets150_down` are not defined anywhere
# in the visible part of this file, and the figure is written to ./pictures
# rather than ./pictures_transpl; this chunk will fail unless those vectors
# already exist in the session. Confirm which target lists are intended.
msig_go_bp <- msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:WIKIPATHWAYS")
GO_enrich_up150_WIKI <- enricher(gene = targets150_up, TERM2GENE = msig_go_bp[, c("gs_name", "gene_symbol")])
GO_enrich_down150_WIKI <- enricher(gene = targets150_down, TERM2GENE = msig_go_bp[, c("gs_name", "gene_symbol")])
p1 <- dotplot(GO_enrich_up150_WIKI, showCategory = 20) +
ggtitle("GO Enrichment WIKIPATHWAYS for UPregulated targets for vesicles 150") +
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 8)
)
p2 <- dotplot(GO_enrich_down150_WIKI, showCategory = 20) +
ggtitle("GO Enrichment WIKIPATHWAYS for DOWNregulated targets for vesicles 150") +
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 8)
)
p1 + p2
combined_plot <- p1 + p2
ggsave("./pictures/GO_enrichment_dotplot_WIKIPATHWAY_type150.tiff", plot = combined_plot, width = 16, height = 10, dpi = 300)
# Subset the sample sheet to the "type 16" samples and key its rows by sample
# ID. which() drops rows whose `type` is NA, and as.data.frame() avoids
# setting row names on a tibble.
# NOTE(review): the sample sheet loaded at the top of this file is printed
# with only `sample` and `condition` columns; `type` (and `patient`, used in
# the design below) must exist for this section to work. Confirm the right
# phenotable is loaded.
coldata_16 <- as.data.frame(coldata[which(coldata$type == 16), ])
rownames(coldata_16) <- coldata_16$sample
coldata_16
# Fixes relative to the previous version:
#  * the sample-ID column is `sample`; `$samples` is not a column name used
#    anywhere else in this file, so the intersection came out empty;
#  * the type-16 intersection is now actually used for the subsetting
#    (previously the unrelated `common_samples` was used);
#  * the always-NULL `counts$miRNA` term is dropped.
common_samples_16 <- intersect(colnames(counts), coldata_16$sample)
missing_samples_16 <- setdiff(coldata_16$sample, common_samples_16)
if (length(missing_samples_16) > 0) {
  stop(
    "Type-16 samples absent from the count table: ",
    paste(missing_samples_16, collapse = ", "),
    call. = FALSE
  )
}
counts_16 <- counts[, common_samples_16, drop = FALSE]
# Order the count columns the same way as the rows of coldata_16.
counts_16 <- counts_16[, rownames(coldata_16), drop = FALSE]
head(counts_16)
Создаем DESeqDataSet из матрицы каунтов
dds_16 <- DESeqDataSetFromMatrix(countData = counts_16,
colData = coldata_16,
design = ~ 0 + patient + condition)
converting counts to integer mode
dds_16$condition <- relevel(dds_16$condition, ref = "before")
dds_16
class: DESeqDataSet
dim: 913 6
metadata(1): version
assays(1): counts
rownames(913): Hsa-Let-7-P1a_3p* Hsa-Let-7-P1a_5p/P2a1_5p/P2a2_5p ... Hsa-Mir-9851_3p Hsa-Mir-9851_5p*
rowData names(0):
colnames(6): 29.1p16_S39_R1_001 15.1p16_S33_R1_001 ... 15.7p16_S31_R1_001 17.7p16_S37_R1_001
colData names(4): sample condition type patient
Фильтрация
dim(dds_16)
[1] 913 6
smallestGroupSize <- 3
keep <- rowSums(counts(dds_16) >= 10) >= smallestGroupSize
dds_16 <- dds_16[keep,]
dim(dds_16)
[1] 219 6
Run Differential Expression Analysis for type 16
dds_16 <- DESeq(dds_16, fitType = "parametric")
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
dds_16
class: DESeqDataSet
dim: 219 6
metadata(1): version
assays(4): counts mu H cooks
rownames(219): Hsa-Let-7-P1a_5p/P2a1_5p/P2a2_5p Hsa-Let-7-P1b_5p ... Hsa-Mir-96-P2_5p Hsa-Mir-96-P3_5p
rowData names(30): baseMean baseVar ... deviance maxCooks
colnames(6): 29.1p16_S39_R1_001 15.1p16_S33_R1_001 ... 15.7p16_S31_R1_001 17.7p16_S37_R1_001
colData names(5): sample condition type patient sizeFactor
# Dispersion estimates of the fitted type-16 model, then the Wald-test results
# for the condition effect.
# NOTE(review): in a three-element `contrast` the second element is the
# numerator and the third the denominator, so log2FoldChange here is
# log2(before / after) (the printed header reads "before vs after"), although
# "before" was set as the reference level. Confirm that this direction is the
# intended one for the "up"/"down" lists derived below.
plotDispEsts(dds_16)
res_16 <- results(dds_16, contrast=c("condition", "before", "after"))
res_16
log2 fold change (MLE): condition before vs after
Wald test p-value: condition before vs after
DataFrame with 219 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
Hsa-Let-7-P1a_5p/P2a1_5p/P2a2_5p 22993.7673 0.1901164 0.224536 0.846707 0.3971586 0.926559
Hsa-Let-7-P1b_5p 2527.3696 0.4961225 0.280481 1.768830 0.0769222 0.575168
Hsa-Let-7-P1c_5p 481.2148 0.5330152 0.461003 1.156209 0.2475958 0.860794
Hsa-Let-7-P2a1_3p* 23.7282 0.4135740 1.966411 0.210319 0.8334185 0.954993
Hsa-Let-7-P2a3_5p 45006.2426 -0.0642043 0.238029 -0.269733 0.7873659 0.940503
... ... ... ... ... ... ...
Hsa-Mir-92-P2c_5p* 25.3899 -0.754468 2.275831 -0.331513 0.7402570 0.931703
Hsa-Mir-95-P2_3p 193.6126 -0.340069 0.662909 -0.512995 0.6079550 0.926559
Hsa-Mir-96-P1_5p 14.4204 -4.216533 2.813830 -1.498503 0.1340025 0.764653
Hsa-Mir-96-P2_5p 4249.9579 -0.537344 0.268063 -2.004539 0.0450123 0.487244
Hsa-Mir-96-P3_5p 121.7907 0.504262 0.806362 0.625355 0.5317382 0.926559
MA plot
1. Фильтрация точек с низкой средней экспрессией (по baseMean): обычно отсекаются точки с baseMean < 1. 2. Определение значимых генов (синие точки): по умолчанию используется порог padj < 0.1, а не < 0.05!
tiff("./pictures/PlotMA_standart_padj_0.05_type16.tiff",
width = 8, height = 6, units = "in", res = 300, bg = "white")
plotMA(res_16, alpha = 0.05, ylim = c(-8, 8))
dev.off()
null device
1
plotMA(res_16, alpha = 0.05, ylim = c(-8, 8))
Кастомный MA plot по p-value
# Custom MA plot for type 16: highlight miRNAs with raw p-value < 0.05 and
# |log2FoldChange| > 1 in blue; everything else is grey.
res_df <- as.data.frame(res_16)
res_df <- mutate(
  res_df,
  color = case_when(
    pvalue < 0.05 & !is.na(pvalue) & abs(log2FoldChange) > 1 ~ "blue",
    TRUE ~ "gray70"
  )
)
point_colours <- c("gray70" = "gray70", "blue" = "blue")
plt <- ggplot(res_df, aes(x = baseMean, y = log2FoldChange, color = color)) +
  geom_point(alpha = 0.7, size = 1) +
  # Horizontal reference line at zero fold change.
  geom_hline(yintercept = 0, linetype = "solid", color = "gray40", size = 1.5) +
  scale_x_log10(labels = scales::scientific) +
  scale_color_manual(values = point_colours) +
  labs(x = "mean of normalized counts", y = "log fold change") +
  theme_minimal() +
  theme(legend.position = "none")
plt
ggsave(
  "./pictures/PlotMA_castom_pvalue0.05_type16.tiff",
  plot = plt, width = 8, height = 6, dpi = 300, bg = "white"
)
Значимые результаты
summary(results(dds_16, contrast=c("condition", "before", "after"), alpha=0.05))
out of 219 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 0, 0%
LFC < 0 (down) : 0, 0%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 8)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Let’s arrange the results by log2FoldChange (descending):
order_indices <- order(-res_16$log2FoldChange)
res_16[order_indices, ]
log2 fold change (MLE): condition before vs after
Wald test p-value: condition before vs after
DataFrame with 219 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
Hsa-Mir-185_5p 14.8310 5.85950 2.69426 2.17481 0.02964445 0.487244
Hsa-Mir-136_5p* 13.3826 5.83711 3.16104 1.84658 0.06480773 0.575168
Hsa-Mir-197_3p 45.5597 4.16951 1.59282 2.61769 0.00885267 0.357073
Hsa-Mir-101-P1_5p* 16.5354 4.08972 2.30004 1.77811 0.07538608 0.575168
Hsa-Mir-10-P3a_5p 37.9479 3.82488 1.69051 2.26256 0.02366292 0.487244
... ... ... ... ... ... ...
Hsa-Mir-154-P17_3p 15.3729 -4.07531 2.60884 -1.56211 0.1182617 0.764653
Hsa-Mir-190-P1_5p 10.5910 -4.10593 3.44201 -1.19289 0.2329137 0.850135
Hsa-Mir-96-P1_5p 14.4204 -4.21653 2.81383 -1.49850 0.1340025 0.764653
Hsa-Mir-128-P1_5p* 15.3208 -4.49032 3.05489 -1.46988 0.1415942 0.764653
Hsa-Mir-197_5p* 17.0795 -5.15605 2.93332 -1.75775 0.0787901 0.575168
Visualisation of counts for the miRNA with the largest log2FoldChange and for the miRNA with the smallest p-value
plotCounts(dds_16, gene=which.max(res_16$log2FoldChange), intgroup="condition")
plotCounts(dds_16, gene=which.min(res_16$pvalue), intgroup="condition")
#plotCounts(dds, gene=rownames(res)[which.min(res$padj[which.max(res$log2FoldChange)])], intgroup="condition")
Volcano plot
# Volcano plot for type 16, based on raw p-values.
volcano_palette <- c("black", "#CBD5E8", "#B3E2CD", "#FDCDAC")
plt <- EnhancedVolcano(
  res_16,
  lab = rownames(res_16),
  x = "log2FoldChange",
  y = "pvalue",
  pCutoff = 0.05,
  FCcutoff = 1,
  labSize = 3.0,
  boxedLabels = FALSE,
  col = volcano_palette,
  colAlpha = 1,
  title = NULL,
  subtitle = NULL
)
plt
ggsave(
  "./pictures/Volcano plot_based_on_Pvalue_16type.tiff",
  plot = plt, width = 8, height = 6, dpi = 300, bg = "white"
)
rlog трансформация • Черные точки – стандартное отклонение отдельных генов. • Красная линия – сглаженный тренд зависимости SD от среднего значения экспрессии. • Если красная линия наклонена вверх → стандартное отклонение растёт с увеличением среднего (плохая нормализация). • Если красная линия примерно горизонтальна → нормализация сработала хорошо.
# Regularized-log transformation of the type-16 data, followed by a mean-SD
# plot showing how the standard deviation changes with the mean expression
# level.
rlt_16 <- rlog(object = dds_16)
meanSdPlot(assay(rlt_16))
# PCA coordinates, with condition and patient attached for plotting.
pca_groups <- c("condition", "patient")
pcaData <- plotPCA(rlt_16, intgroup = pca_groups, returnData = TRUE)
using ntop=500 top features by variance
# PCA before removing the patient effect.
percentVar <- round(100 * attr(pcaData, "percentVar"))
pc1_label <- paste0("PC1: ", percentVar[1], "%")
pc2_label <- paste0("PC2: ", percentVar[2], "%")
ggplot(pcaData, aes(PC1, PC2, shape = patient, color = condition)) +
  geom_point(size = 3) +
  xlab(pc1_label) +
  ylab(pc2_label) +
  coord_fixed() +
  scale_color_brewer(palette = "Set2") +
  theme_bw()
# Remove the patient effect from the rlog values (for visualisation only),
# then recompute the PCA coordinates on the corrected values.
# NOTE(review): no `design` is passed to removeBatchEffect(); confirm this is
# acceptable for how patients and conditions are paired in this data set.
assay(rlt_16) <- limma::removeBatchEffect(
  assay(rlt_16),
  batch = colData(dds_16)[, "patient"]
)
pcaData <- plotPCA(
  rlt_16,
  intgroup = c("condition", "patient"),
  returnData = TRUE
)
using ntop=500 top features by variance
# PCA after removing the patient effect.
percentVar <- round(100 * attr(pcaData, "percentVar"))
pc1_label <- paste0("PC1: ", percentVar[1], "%")
pc2_label <- paste0("PC2: ", percentVar[2], "%")
plt <- ggplot(pcaData, aes(PC1, PC2, shape = patient, color = condition)) +
  geom_point(size = 3) +
  xlab(pc1_label) +
  ylab(pc2_label) +
  coord_fixed() +
  scale_color_brewer(palette = "Set2") +
  theme_bw()
plt
ggsave(
  "./pictures/PCA plot for type 16 after removing donor effect.tiff",
  plot = plt, width = 8, height = 6, dpi = 300, bg = "white"
)
Plot a heatmap of differentially expressed genes (p-value < 0.05 and |log2FoldChange| > 1)
# miRNAs passing the filter: raw p-value < 0.05 and |log2FoldChange| > 1,
# sorted by decreasing log2FoldChange. which() drops NA comparisons, as
# subset() does.
passes_filter <- which(
  res_16$pvalue < 0.05 &
    !is.na(res_16$pvalue) &
    abs(res_16$log2FoldChange) > 1.0
)
res_sign_16 <- res_16[passes_filter, ]
lfc_order <- order(res_sign_16$log2FoldChange, decreasing = TRUE)
res_sign_16 <- res_sign_16[lfc_order, ]
# Names of the miRNAs that passed the filter.
sig_genes <- rownames(res_sign_16)
de_mat <- assay(rlt_16)[sig_genes, ]
# Row-wise z-scores of the (patient-corrected) rlog values.
datamatrix <- t(scale(t(de_mat)))
annotation_col <- data.frame(condition = coldata_16$condition)
rownames(annotation_col) <- colnames(datamatrix)
annotation_colors <- list(
  condition = c("before" = "#FFCC00", "after" = "#3399FF")
)
plt <- pheatmap(
  datamatrix,
  cluster_rows = TRUE,
  show_rownames = TRUE,
  cluster_cols = TRUE,
  annotation_col = annotation_col,
  annotation_colors = annotation_colors,
  display_numbers = TRUE,
  legend = FALSE,
  fontsize = 15
)
plt
ggsave(
  "./pictures/Heatmap of diff expressed genes_type16.tiff",
  plot = plt, width = 8, height = 6, dpi = 300, bg = "white"
)
Plot of the distance between samples heatmap. Расчет расстояний между образцами: • Обычно используется евклидово расстояние (по умолчанию в функции dist()). • Оно вычисляется по трансформированным данным экспрессии (rlog() или vst()). • Чем меньше расстояние, тем более похожи образцы.
# Euclidean distances between the type-16 samples, computed on the rlog
# values stored in rlt_16.
sampleDists_16 <- dist(t(assay(rlt_16)))
sampleDistMatrix_16 <- as.matrix(sampleDists_16)
# Label rows and columns as "<condition>_patient<patient>".
rownames(sampleDistMatrix_16) <- paste(rlt_16$condition, rlt_16$patient, sep = "_patient")
colnames(sampleDistMatrix_16) <- paste(rlt_16$condition, rlt_16$patient, sep = "_patient")
# Pass the precomputed distances to the clustering. With "euclidean",
# pheatmap would compute a second distance between the rows of the distance
# matrix itself and cluster on that instead of on the sample distances.
# `colors` is the palette defined for the earlier sample-distance heatmap.
plt <- pheatmap(sampleDistMatrix_16,
                clustering_distance_rows = sampleDists_16,
                clustering_distance_cols = sampleDists_16,
                fontsize = 12,
                legend = FALSE,
                display_numbers = TRUE,
                color = colors)
plt
ggsave("./pictures/Plot of the distance between samples_type16.tiff", plot = plt, width = 8, height = 6, dpi = 300, bg = "white")
# Split the filtered miRNAs by the sign of log2FoldChange.
# NOTE(review): with the contrast used above, a positive log2FoldChange means
# higher expression in "before" than in "after"; confirm that the "up"/"down"
# names reflect the intended direction.
res_sign_16_df <- as.data.frame(res_sign_16)
up_16 <- filter(res_sign_16_df, log2FoldChange > 0)
down_16 <- filter(res_sign_16_df, log2FoldChange < 0)
rownames(up_16)
[1] "Hsa-Mir-185_5p" "Hsa-Mir-197_3p" "Hsa-Mir-10-P3a_5p" "Hsa-Mir-148-P3_5p*" "Hsa-Mir-30-P1a_3p*" "Hsa-Mir-150_3p*"
[7] "Hsa-Mir-361_3p*" "Hsa-Mir-10-P2b_5p" "Hsa-Mir-126_3p*"
rownames(down_16)
[1] "Hsa-Mir-340_5p" "Hsa-Mir-181-P2c_5p" "Hsa-Mir-223_5p*" "Hsa-Mir-30-P1c_3p*"
Переводим в miRBase • miRBase: https://www.mirbase.org/ • MirGeneDB: https://mirgenedb.org/
url <- "https://mirgenedb.org/browse/hsa"
page <- read_html(url)
Парсим таблицу
# Extract the first HTML table of the MirGeneDB browse page and keep its first
# two columns as a MirGeneDB ID -> miRBase ID lookup table.
mir_table <- page %>%
  html_element("table") %>%
  html_table(fill = TRUE)
mir_table <- mir_table[-c(1:3), c(1, 2)]
colnames(mir_table) <- c("MirGeneDB_ID", "MiRBase_ID")
# Remove the first " V" marker from the MirGeneDB IDs.
mir_table$MirGeneDB_ID <- sub(" V", "", mir_table$MirGeneDB_ID)
head(mir_table)
# Strip everything from the first "_" onward (the arm suffix such as "_5p")
# before looking the IDs up in the table; unmatched IDs become NA.
up_16_clean <- sub("_.*", "", row.names(up_16))
up_16_converted <- mir_table$MiRBase_ID[match(up_16_clean, mir_table$MirGeneDB_ID)]
# Fix: `down_16_clean` was used below while its definition was commented out,
# which fails with "object 'down_16_clean' not found".
down_16_clean <- sub("_.*", "", row.names(down_16))
down_16_converted <- mir_table$MiRBase_ID[match(down_16_clean, mir_table$MirGeneDB_ID)]
up_16_converted
[1] "hsa-mir-185" "hsa-mir-197" NA "hsa-mir-152" "hsa-mir-30d" "hsa-mir-150" "hsa-mir-361" "hsa-mir-99b" NA
down_16_converted
[1] "hsa-mir-340" "hsa-mir-181d" "hsa-mir-223" NA
Конвертация в MIMAT ID: в итоге NA заменены вручную на самые близкие соответствия, хотя это сомнительная практика.
NA Hsa-Mir-10-P3a_5p есть два соответствия: Hsa-Mir-10-P1c = hsa-mir-10a Hsa-Mir-10-P3b = hsa-mir-125a
NA Hsa-Mir-126_3p* есть одно соответствие: Hsa-Mir-126-P2 = hsa-mir-126
NA Hsa-Mir-30-P1c_3p* есть три соответствия: Hsa-Mir-30-P1a = hsa-mir-30d Hsa-Mir-30-P1b = hsa-mir-30a Hsa-Mir-30-P1d = hsa-mir-30e
[1] “Hsa-Mir-185_5p” “Hsa-Mir-197_3p” “Hsa-Mir-10-P3a_5p”
“Hsa-Mir-148-P3_5p” ”Hsa-Mir-30-P1a_3p”
“Hsa-Mir-150_3p”
[7] ”Hsa-Mir-361_3p” “Hsa-Mir-10-P2b_5p” “Hsa-Mir-126_3p”
[1] ”Hsa-Mir-340_5p” ”Hsa-Mir-181-P2c_5p” ”Hsa-Mir-223_5p”
“Hsa-Mir-30-P1c_3p*”
MI (MicroRNA Gene ID) — это идентификатор предшественника (precursor) miRNA MIMAT (Mature miRNA ID) — это идентификатор зрелой (mature) miRNA, которая функционирует в клетке
# Hand-curated mature miRNA names for the type-16 up/down lists.
# Fix: mature miRBase names are written with "miR" (capital R); lowercase
# "mir" denotes the precursor. The humoral section of this file already uses
# the "hsa-miR-..." form, so the same casing is used here to make sure the
# names can be matched to mature (MIMAT) accessions.
up_16_converted <- c("hsa-miR-185-5p", "hsa-miR-197-3p", "hsa-miR-152-5p", "hsa-miR-30d-3p", "hsa-miR-150-3p", "hsa-miR-361-3p", "hsa-miR-99b-5p", "hsa-miR-126-3p")
down_16_converted <- c("hsa-miR-340-5p", "hsa-miR-181d-5p", "hsa-miR-223-5p")
# Precursor-level alternative, kept for reference (it yields only MI
# accessions, not MIMAT):
# up_16_converted <- c("hsa-mir-185", "hsa-mir-197", "hsa-mir-152", "hsa-mir-30d", "hsa-mir-150", "hsa-mir-361", "hsa-mir-99b", "hsa-mir-126")
# down_16_converted <- c("hsa-mir-340", "hsa-mir-181d", "hsa-mir-223")
converted_mirna_up16 <- miRNAVersionConvert(up_16_converted)
converted_mirna_down16 <- miRNAVersionConvert(down_16_converted)
converted_mirna_up16
converted_mirna_down16
Запрос таргетов из базы multiMiR
# Query multiMiR for predicted targets of the up- and down-regulated miRNAs
# and keep the unique target gene symbols.
# NOTE(review): this section queries the "predicted" table, whereas the
# humoral section queries "validated"; confirm the difference is intended.
multimir_hits_up16 <- get_multimir(
  org = "hsa",
  mirna = converted_mirna_up16$Accession,
  table = "predicted"
)
targets16_up <- unique(multimir_hits_up16@data$target_symbol)
targets16_up
multimir_hits_down16 <- get_multimir(
  org = "hsa",
  mirna = converted_mirna_down16$Accession,
  table = "predicted"
)
targets16_down <- unique(multimir_hits_down16@data$target_symbol)
targets16_down